In [1]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# --- Configuration ----------------------------------------------------------
# Location of the loan-approval CSV. The default is the path this notebook was
# originally written against; set the LOAN_DATA_PATH environment variable to
# run it elsewhere without editing the cell.
DATA_PATH = os.environ.get(
    "LOAN_DATA_PATH",
    "C:/Users/DELL/OneDrive/Desktop/cohort 4/Python/loan_approval_dataset.csv",
)
RANDOM_STATE = 42   # shared seed for the split and the forest
TEST_SIZE = 0.2     # fraction of rows held out for evaluation
TARGET_LABELS = {'Approved': 1, 'Rejected': 0}
CATEGORICAL_FEATURES = ['education', 'self_employed']
ENGINEERED_FEATURES = ['debt_income_ratio', 'monthly_emi']

# --- Load -------------------------------------------------------------------
df = pd.read_csv(DATA_PATH)
# Column names are stripped because the header cells may carry stray whitespace.
df.columns = df.columns.str.strip()
# Encode on a copy so `df` keeps the original string values.
df_encoded = df.copy()

# --- Encode the target ------------------------------------------------------
status_clean = df['loan_status'].str.strip()
df_encoded['loan_status'] = status_clean.map(TARGET_LABELS)
# Series.map() yields NaN for any value that is not a key of TARGET_LABELS.
# Fail here with the offending labels instead of letting NaN reach the model.
unmapped = df_encoded['loan_status'].isna()
if unmapped.any():
    unexpected = sorted(status_clean[unmapped].dropna().unique())
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have a loan_status outside "
        f"{list(TARGET_LABELS)}; unexpected values: {unexpected!r}"
    )

# --- Encode categorical features --------------------------------------------
# One encoder per column, kept in `encoders`, so every column's integer codes
# can still be mapped back to labels (classes_ / inverse_transform). Refitting
# a single shared encoder would discard the mapping of the earlier column.
# `le` remains bound to the last encoder fitted ('self_employed').
encoders = {}
for column in CATEGORICAL_FEATURES:
    le = LabelEncoder()
    df_encoded[column] = le.fit_transform(df_encoded[column].str.strip())
    encoders[column] = le

# --- Feature engineering ----------------------------------------------------
df_encoded['debt_income_ratio'] = df_encoded['loan_amount'] / df_encoded['income_annum']
# NOTE(review): named "monthly" but computed as loan_amount / loan_term;
# confirm the unit of loan_term before interpreting this as a monthly figure.
df_encoded['monthly_emi'] = df_encoded['loan_amount'] / df_encoded['loan_term']
# A zero denominator turns the ratio into +/-inf, which would only surface
# later as an input-validation error inside fit(). Report it here instead.
infinite = df_encoded[ENGINEERED_FEATURES].isin([float('inf'), float('-inf')]).any()
if infinite.any():
    raise ValueError(
        "Division by zero produced infinite values in "
        f"{list(infinite[infinite].index)}; "
        "check 'income_annum' and 'loan_term' for zeros"
    )

# --- Define X and y ---------------------------------------------------------
# 'loan_id' is dropped from the features along with the target column.
X = df_encoded.drop(['loan_id', 'loan_status'], axis=1)
y = df_encoded['loan_status']

# --- Train-test split -------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# --- Train Random Forest model ----------------------------------------------
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)
rf_model.fit(X_train, y_train)

# --- Predict ----------------------------------------------------------------
y_pred = rf_model.predict(X_test)

# --- Evaluate ---------------------------------------------------------------
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_test, y_pred))
✅ Accuracy: 0.9988290398126464
📊 Confusion Matrix:
[[317 1]
[ 0 536]]
📄 Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 318
1 1.00 1.00 1.00 536
accuracy 1.00 854
macro avg 1.00 1.00 1.00 854
weighted avg 1.00 1.00 1.00 854
In [2]:
from sklearn.model_selection import cross_val_score

# Score the forest with 5-fold cross-validation over the full feature matrix
# and target, then report the per-fold scores and their average.
cv_scores = cross_val_score(estimator=rf_model, X=X, y=y, cv=5)
mean_cv_accuracy = cv_scores.mean()

print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", mean_cv_accuracy)
Cross-validation scores: [0.9941452 0.99531616 0.99765808 0.99648712 0.99765533] Mean CV accuracy: 0.9962523782983876
In [3]:
import matplotlib.pyplot as plt

# Label each importance of the fitted forest with its feature column name.
importances = pd.Series(rf_model.feature_importances_, index=X.columns)

# Horizontal bar chart of the ten largest importances; the title and axis
# label are set on the axes object returned by the pandas plot call.
top_features = importances.nlargest(10)
ax = top_features.plot.barh()
ax.set_title("Top 10 Features Affecting Loan Approval")
ax.set_xlabel("Feature Importance")
plt.show()
In [4]:
!pip install plotly
Requirement already satisfied: plotly in c:\users\dell\appdata\local\programs\python\python313\lib\site-packages (6.2.0) Requirement already satisfied: narwhals>=1.15.1 in c:\users\dell\appdata\local\programs\python\python313\lib\site-packages (from plotly) (1.46.0) Requirement already satisfied: packaging in c:\users\dell\appdata\local\programs\python\python313\lib\site-packages (from plotly) (25.0)
In [5]:
from plotly import io as pio

# Select the 'notebook' renderer (intended for the classic Jupyter Notebook
# front end) as the default for every subsequent fig.show() call.
pio.renderers.default = "notebook"
In [6]:
import plotly.express as px

# Histogram of annual income, with bars coloured by the loan_status column
# of the loaded frame.
histogram_options = dict(
    x="income_annum",
    color="loan_status",
    title="Applicant Income vs Loan Approval Status",
)
fig = px.histogram(df, **histogram_options)
fig.show()
In [8]:
import plotly

# Print the version string of the plotly package imported by this kernel.
plotly_version = plotly.__version__
print(plotly_version)
6.2.0
In [10]:
import plotly.graph_objects as go

# Minimal three-bar figure built from literal values, independent of the
# loan data, to check that plotly figures render in this notebook.
smoke_test_bars = go.Bar(y=[2, 3, 1])
fig = go.Figure(data=smoke_test_bars)
fig.show()
In [ ]: